# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import datetime
import logging

import pymongo
from scrapy.conf import settings

class HeartPipeline(object):
    """Pass-through pipeline: every item is handed on untouched."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item


class MongoPipeline(object):
    """Item pipeline that stores scraped items in MongoDB.

    Items are keyed by ``article_id`` inside the collection named by the
    item's ``collection`` attribute.  The first time an ``article_id`` is seen
    the whole item is inserted; on later sightings only the item's current
    tag value is merged into the stored document.  When the spider closes, a
    bookkeeping record (run counter and date) is appended to a separate
    collection.
    """

    # Name of the collection that receives one bookkeeping record per crawl.
    # Filled in by ``from_crawler``; when left as None, ``close_spider`` falls
    # back to the module-level ``settings`` object.
    mongo_time_coll = None

    _log = logging.getLogger(__name__)

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection URI and database name.

        No connection is made here; that happens in ``open_spider``.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        Reads ``MONGO_URI`` and ``MONGO_DB`` for the constructor and also
        captures ``MONGO_TIME_COLL`` so that ``close_spider`` does not have to
        consult the global settings object.
        """
        pipeline = cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )
        # Set as an attribute rather than a constructor argument so that
        # subclasses overriding ``__init__(mongo_uri, mongo_db)`` keep working.
        pipeline.mongo_time_coll = crawler.settings.get('MONGO_TIME_COLL')
        return pipeline

    def open_spider(self, spider):
        """Print a start timestamp and open the MongoDB connection."""
        print("开始爬取", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert a new article, or merge the item's tag into a stored one.

        Args:
            item: mapping with an ``article_id`` key and a ``collection``
                attribute naming the target collection.  For the merge path
                it must also provide ``tag1`` and ``tag[tag1]`` (a sequence
                whose first element is the value to add).
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Errors from the lookup or the initial insert propagate.  The tag
        merge is best-effort: a failure there is logged and the item is
        still returned.
        """
        collection = self.db[item.collection]
        query = {"article_id": item["article_id"]}

        if collection.find_one(query) is None:
            collection.insert_one(dict(item))
            return item

        try:
            tag_key = item["tag1"]
            collection.update_one(
                query,
                {"$addToSet": {"tag." + tag_key: item["tag"][tag_key][0]}},
            )
        except Exception:
            # The original code swallowed every error here (including
            # KeyboardInterrupt) via ``finally: return``.  Keep the
            # best-effort behaviour for ordinary errors, but leave a trace.
            self._log.warning(
                "Could not merge tag into article %r",
                query["article_id"],
                exc_info=True,
            )
        return item

    def close_spider(self, spider):
        """Record this crawl in the bookkeeping collection and disconnect.

        Appends ``{"times": <existing record count + 1>, "date": <today>}``
        and prints an end timestamp.  The client is closed even if the
        bookkeeping write fails; the failure itself still propagates.
        """
        try:
            coll_name = self.mongo_time_coll
            if coll_name is None:
                # Pipeline was not built through from_crawler (or the setting
                # is absent there): use the global settings as before.
                coll_name = settings['MONGO_TIME_COLL']
            self.lastTime = self.db[coll_name]
            last_times = self.lastTime.count_documents({})
            self.lastTime.insert_one({
                "times": last_times + 1,
                "date": datetime.datetime.now().strftime('%Y.%m.%d'),
            })
            print("爬取结束", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        finally:
            self.client.close()
